#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2010 Marcel Martin <marcel.martin@tu-dortmund.de>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""%prog [options] <FASTA/FASTQ FILE> [<QUALITY FILE>]

Reads a FASTA or FASTQ file, finds and removes adapters,
and writes the changed sequence to standard output.
When finished, statistics are printed to standard error.

If two file names are given, they are assumed to be
.csfasta and .qual files as produced by the SOLiD sequencer.
(You still need to provide the -c option to correctly deal
with color space.)

If the name of any input or output file ends with '.gz', it is
assumed to be gzip-compressed.

If you want to search for the reverse complement of an adapter, you must
provide an additional adapter sequence.

If the input sequences are in color space, the adapter must
also be provided in color space (using a string of digits 0123).

EXAMPLE

Assuming your sequencing data is available as a FASTQ file, use this
command line:
$ cutadapt -e ERROR-RATE -a ADAPTER-SEQUENCE input.fastq > output.fastq

Use --help to see a description of all available command-line options.
"""

from __future__ import print_function, division

__version__ = '0.6'

import sys
import re
import gzip
import time
from string import maketrans
from optparse import OptionParser
from itertools import izip
from contextlib import closing

from align import semiglobalalign
import fasta


class FormatError(Exception):
	pass


class Error(Exception):
	pass


class UnknownFileType(Exception):
	pass


def xopen(filename, mode='r', is_closing=True):
	"""
	Replacement for the "open" function that can also open
	files that have been compressed with gzip. If the filename ends with .gz,
	the file is opened with gzip.open(). If it doesn't, the regular open()
	is used.
	closing -- whether to wrap the returned GzipFile with contextlib.closing
		to make it usable in 'with' statements. (TODO look for a nicer solution)
	"""
	if filename.endswith('.gz'):
		if is_closing:
			return closing(gzip.open(filename, mode))
		else:
			return gzip.open(filename, mode)
	else:
		return open(filename, mode)


def print_statistics(adapters, stats, outfile=sys.stderr):
	print("Command line parameters:", " ".join(sys.argv[1:]), file=outfile)
	print(stats.n, "reads processed in %.2f seconds, %.2f ms per read." % (stats.time, 1000. * stats.time / stats.n), file=outfile)
	print(stats.reads_changed, "reads were trimmed (%.1f%%)" % (100.*(stats.reads_changed)/stats.n), file=outfile)
	print(file=outfile)
	for adapter_index, adapter in enumerate(adapters):
		print("Statistics for adapter %s (sequence '%s', length %d)" % (adapter_index+1, adapter, len(adapter)), file=outfile)
		print(file=outfile)
		print("adapter found in the beginning of the read", file=outfile)
		print("length", "number", sep="\t", file=outfile)
		total = 0
		for k, v in stats.lengths_front[adapter_index].items():
			print(k, v, sep="\t", file=outfile)
			total += v
		print("total:", total, file=outfile)
		print(file=outfile)
		print(file=outfile)
		print("adapter found at the end of the read", file=outfile)
		print("length", "number", sep="\t", file=outfile)
		total = 0
		for k, v in stats.lengths_back[adapter_index].items():
			print(k, v, sep="\t", file=outfile)
			total += v
		print("total:", total, file=outfile)
		print(file=outfile)
		print(file=outfile)


def quality_to_ascii(quality_line, base=33):
	"""
	Convert a string containing qualities given as integer to a string of ASCII-encoded qualities.

	base -- ASCII code of quality zero (sensible values are 33 and 64).

	>>> quality_to_ascii("17 4 29 18")
	'2%>3'
	"""
	fields = map(int, quality_line.split())
	qualities = ''.join(chr(q+base) for q in fields)
	return qualities


def find_best_alignment(adapters, seq, max_error_rate, minimum_overlap):
	"""
	Find the best matching adapter.

	adapters -- a list of adapter sequences
	seq -- the sequence to which the adapter should be aligned
	max_error_rate -- maximum allowed error rate. The error rate is
		the number of errors in the alignment divided by the length
		of the alignment.

	Return tuple (best_alignment, best_index).

	best_alignment is an alignment as returned by semiglobalalign.
	best_index is the index of the best adapter into the adapters list.
	"""
	best_score = 0
	best_alignment = None
	best_index = None
	for index, adapter in enumerate(adapters):
		alignment = semiglobalalign(adapter, seq)
		(r1, r2, astart, astop, rstart, rstop, errors) = alignment
		length = astop - astart
		if length < minimum_overlap or errors/length > max_error_rate:
			continue

		# the length of the matching part of the adapter minus errors
		# determines which adapter fits best
		score = length - errors
		if score > best_score:
			best_alignment = alignment
			best_score = score
			best_index = index
	return (best_alignment, best_index)


def fastafiletype(fname):
	"""
	Determine file type of fname. Return the string FASTQ or FASTA or
	raise an UnknownFileType exception.
	"""
	with xopen(fname) as f:
		for line in f:
			if line.startswith('#'):
				continue
			if line.startswith('@'):
				return 'FASTQ'
			if line.startswith('>'):
				return 'FASTA'
			raise UnknownFileType("neither FASTQ nor FASTA")


def read_sequences(seqfilename, qualityfilename, colorspace):
	"""
	Read sequences and (if available) quality information from either:
	* seqfilename in FASTA format (qualityfilename must be None)
	* seqfilename in FASTQ format (qualityfilename must be None)
	* seqfilename in .csfasta format and qualityfilename in .qual format (SOLiD color space)

	Return a generator over tuples (description, sequence, qualities).
	qualities is None if no qualities are available.
	"""
	ftype = fastafiletype(seqfilename)

	if ftype == 'FASTQ' and qualityfilename is not None:
		raise Error("If a FASTQ file is given, no quality file can be provided.")

	if ftype == 'FASTQ':
		with xopen(seqfilename) as seqfile:
			for desc, seq, qualities in fasta.readfastq(seqfile, colorspace=colorspace):
				yield (desc, seq, qualities)
	elif ftype == 'FASTA' and qualityfilename is None:
		with xopen(seqfilename) as seqfile:
			for desc, seq in fasta.readfasta(seqfile):
				yield (desc, seq, None)
	else:
		assert ftype == 'FASTA' and qualityfilename is not None
		lengthdiff = 1 if colorspace else 0
		with xopen(seqfilename) as seqfile:
			with xopen(qualityfilename) as qualityfile:
				seq_iter = fasta.readfasta(seqfile)
				qual_iter = fasta.readfasta(qualityfile)
				for (qdesc, qseq), (rdesc, rseq) in izip(qual_iter, seq_iter):
					if qdesc != rdesc:
						raise FormatError("Descriptions in FASTA and quality file don't match (%s and %s)." % (rdesc, qdesc))
					qualities = quality_to_ascii(qseq)
					if len(rseq) == 0 and colorspace:
						raise FormatError("When reading '%s', no sequence was found (at least the initial primer must appear)." % rdesc)
					if len(qualities) != len(rseq) - lengthdiff:
						raise FormatError("While reading '%s': expected to find %d quality values, but found %d." % (rdesc, len(rseq) - lengthdiff, len(qualities)))
					yield rdesc, rseq, qualities


def process_read(desc, seq, qualities, options, stats):
	# In colorspace, the first character is the last nucleotide of the primer base
	# and the second character encodes the transition from the primer base to the
	# first real base of the read.
	if options.trim_primer:
		seq = seq[2:]
		if qualities is not None:
			qualities = qualities[1:]
		initial = ''
	elif options.colorspace:
		initial = seq[0]
		seq = seq[1:]
	else:
		initial = ''

	any_adapter_matches = False

	# try (possibly more than once) to remove an adapter
	for t in xrange(options.times):
		alignment, index = find_best_alignment(options.adapters, seq, options.error_rate, options.overlap)
		if alignment is None:
			break

		(r1, r2, astart, astop, rstart, rstop, errors) = alignment
		length = astop - astart
		assert length > 0
		assert errors/length <= options.error_rate
		assert length - errors > 0

		any_adapter_matches = True
		adapter = options.adapters[index]
		# TODO temporary
		alen = len(adapter)
		rlen = len(seq)

		if rstart == 0 and rstop == len(seq):
			# The adapter or parts of it covers the entire read (case 1)
			assert r1[0] != '-' and r1[-1] != '-'
			assert rstart == 0 and rstop == len(seq)
			seq = ''
			if qualities is not None:
				qualities = ''
		elif rstart > 0:
			# The adapter is at the end of the read (case 2) or in the middle (case 3)
			assert r1[0] == '-'
			assert not (rstart == 0 and rstop == len(seq))
			assert astart == 0
			assert rstart > 0
			assert r2[0] != '-' # since that only happens when the adapter is in the front

			if rstop < rlen:
				# The adapter is in the middle of the read (case 3)
				assert r1[-1] == '-'
				stats.middle += 1
				if options.rest_file is not None:
					print(seq[rstop:], file=options.rest_file)
			# TODO defaultdict?
			stats.lengths_back[index][length] = stats.lengths_back[index].get(length, 0) + 1

			if options.colorspace:
				# trim one more color
				rstart -= 1
			seq = seq[:rstart]
			if qualities is not None:
				qualities = qualities[:rstart]
				assert len(qualities) == len(seq)
		else:
			# The adapter is in the beginning of the read (case 4)
			# TODO should this case be ignored in color space?
			assert r1[-1] == '-'
			assert not (rstart == 0 and rstop == len(seq))
			assert rstart == 0
			if True:
				stats.lengths_front[index][length] = stats.lengths_front[index].get(length, 0) + 1
				seq = seq[rstop:]
				if qualities is not None:
					qualities = qualities[rstop:]
		stats.adapter_matches[index] += 1
		stats.sequence_matches[index] += 1

	if any_adapter_matches:
		stats.reads_changed += 1
	if options.discard and any_adapter_matches:
		return (None, None, None)

	# other modifications to the sequence or its description
	# change any length=XX that may appear in the read description (for 454)
	desc = re.sub(r"\blength=[0-9]*\b", "length=%d" % len(seq), desc)

	if options.strip_f3 and desc.endswith('_F3'):
		desc = desc[:-3]
	desc = options.prefix + desc + options.suffix
	if options.double_encode:
		# convert color space sequence to double-encoded colorspace (using
		# characters ACGTN to represent colors)
		seq = seq.translate(trans)

	if options.colorspace:
		assert len(seq) == len(qualities)
	return (desc, initial + seq, qualities)


def main():
	parser = OptionParser(usage=__doc__, version=__version__)
	parser.add_option("-e", "--error-rate", type=float, default=0.1,
		help="Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: %default)")
	parser.add_option("-o", "--output", default=None,
		help="The modified sequence gets written to this file (in FASTA format) (default: standard output)")
	parser.add_option("-a", "--adapters", action="append",
		help="Adapter sequences (use multiple -a options to search for more than one adapter)")
	parser.add_option("--discard", action='store_true', default=False,
		help="Discard reads that contain the adapter instead of trimming them. Also use -O in order to avoid throwing away too many reads!")
	parser.add_option("-n", "--times", type=int, metavar="COUNT", default=1,
		help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times.")
	parser.add_option("-O", "--overlap", type=int, metavar="LENGTH", default=3,
		help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified.")
	#parser.add_option("--front", action='store_true', default=False,
		#help="Allow adapters to occur in the front of the read.") TODO adapter overlapping the front?
	parser.add_option("-r", "--rest-file", default=None,
		help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file. Use - for standard output.")
	parser.add_option("-x", "--prefix", default='',
		help="Add this prefix to read names")
	parser.add_option("-y", "--suffix", default='',
		help="Add this suffix to read names")
	parser.add_option("-c", "--colorspace", action='store_true', default=False,
		help="Colorspace mode: Trims adapter correctly (one more character than there are matches needs to be trimmed).")
	parser.add_option("-d", "--double-encode", action='store_true', default=False,
		help="When in color space, double-encode colors (mapping 0,1,2,3,4 to A,C,G,T,N).")
	parser.add_option("-t", "--trim-primer", action='store_true', default=False,
		help="When in color space, trim primer base and the transition to the first color")
	parser.add_option("--strip-f3", action='store_true', default=False,
		help="When in color space, strip the _F3 suffix of read names")
	parser.add_option("--maq", "--bwa", action='store_true', default=False,
		help="MAQ/BWA-compatible color space output. This enables -c, -d, -t, --strip-f3 and -y '/1'.")

	options, args = parser.parse_args()

	if len(args) == 0:
		parser.error("At least one parameter needed: name of the FASTA or FASTQ file.")
	elif len(args) > 2:
		parser.error("Too many parameters.")

	infilename = args[0]
	quality_filename = None
	if len(args) == 2:
		quality_filename = args[1]
		print("will use qualities from", quality_filename, file=sys.stderr)

	if options.output is None:
		outfile = sys.stdout
	else:
		outfile = xopen(options.output, 'w', is_closing=False)

	if options.maq:
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_f3 = True
		options.suffix = "/1"
	if options.trim_primer and not options.colorspace:
		parser.error("trimming the primer makes only sense in colorspace")
	if options.double_encode and not options.colorspace:
		parser.error("double-encoding makes only sense in colorspace")

	if not (0 <= options.error_rate <= 1.):
		parser.error("The maximum error rate must be between 0 and 1.")

	print("maximum error rate: %.2f%%" % (options.error_rate*100.), file=sys.stderr)

	if options.rest_file is not None:
		if options.rest_file == '-':
			options.rest_file = sys.stdout
		else:
			options.rest_file = xopen(options.rest_file, 'w', is_closing=False)

	if not options.adapters:
		print("You need to provide at least one adapter sequence.", file=sys.stderr)
		sys.exit(1)

	class Statistics(object):
		pass
	stats = Statistics()
	stats.reads_changed = 0
	stats.middle = 0
	stats.adapter_matches = len(options.adapters)*[0]
	stats.sequence_matches = len(options.adapters)*[0]
	stats.lengths_front = []
	stats.lengths_back = []
	for x in xrange(len(options.adapters)):
		stats.lengths_front.append({})
	for x in xrange(len(options.adapters)):
		stats.lengths_back.append({})

	# for double-encoding colorspace sequences
	global trans
	trans = maketrans('0123.', 'ACGTN')

	# length difference between sequence and qualities
	lengthdiff = 0 if options.trim_primer else 1

	show_progress = False

	# count the reads in which at least one adapter has been removed
	n = 0
	qualities = None
	start_time = time.clock()
	try:
		for (desc, seq, qualities) in read_sequences(infilename, quality_filename, colorspace=options.colorspace):
			n += 1
			if show_progress and n % 10000 == 0:
				print("Processing read no.", n, file=sys.stderr)

			desc, seq, qualities = process_read(desc, seq, qualities, options, stats)
			if seq is not None:
				# write modified sequence in either FASTA or FASTQ format
				if qualities is None:
					# FASTA
					print('>%s\n%s' % (desc, seq), file=outfile)
				else:
					# FASTQ
					print('@%s\n%s\n+\n%s' % (desc, seq, qualities), file=outfile)
	except FormatError as e:
		print(e, file=sys.stderr)
		return 1

	stats.time = time.clock()
	stats.n = n
	print_statistics(options.adapters, stats)
	return 0


if __name__ == '__main__':
	#import cProfile as profile
	#profile.run('main()', 'cutadapt.prof')
	sys.exit(main())

"""
Interpreting the result of a semiglobal alignment between the adapter and the read sequences
============================================================================================

There are nine cases, which can be grouped into four categories.

A: character of the adapter sequence (r1)
R: character of the read sequence    (r2)
-: gap symbol

The middle of the alignment is always like this:
AAAA
RRRR

For the part of that is before the middle and for the part that is after the middle,
there are three possibilities each (making 3*3=9)

A  and -  and  'nothing'
-      R

Grouping by semantics of those alignments, we get the following three categories.


1. The adapter covers or parts of it cover the entire read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

a)
AAAA
RRRR

b)
AAAA
RR--

c)
AAAA
--RR

1d)
AAAA
-RR-

2. The adapter is at the end of the read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

a)
---AAA
RRRRRR

b)
---AAAA
RRRRR--

3. The adapter is in the middle of the read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

---AAA---
RRRRRRRRR

4. The adapter is in the beginning of the read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

a)
AAAA----
RRRRRRRR

b)
AAAA----
--RRRRRR
"""
